<?php
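/**
 * Miniature PHP "browser" - a basic building block for crawlers.
 * author: Yangjifang
 * date: 2016-11-28
 * Implements part of a browser's automatic behaviour on top of the HTTP
 * protocol: sets cookies and recognises some status codes.
 *
 * Flow:
 *   1. Request the page for the first time.
 *   2. Obtain the response.
 *   3. Work out from the response what has to be done next, if anything.
 * Parameter: a URL. The content finally obtained may be an image or may be text.
 */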



/**
 * Minimal cookie-aware HTTP fetcher that drives the `curl` command line tool.
 *
 * explore() runs curl for the current URL, echoes the body, reads the response
 * headers that curl dumped to a file, remembers any cookies, and repeats the
 * request for as long as the response carries a Location header (up to
 * MAX_REDIRECTS hops).
 */
class Explore{

	/**
	* Upper bound on Location hops followed by one explore() call, so that a
	* redirect loop cannot run forever.
	*/
	const MAX_REDIRECTS = 10;

	/**
	* Request headers sent with every request, as key/val pairs.
	* @var Array
	*/
	private $init_header = array(
		array("key"=>"User-Agent","val"=>"Mozilla/5.0 (Windows NT 6.1; WOW64; rv:49.0) Gecko/20100101 Firefox/49.0"),
		array("key"=>"Connection","val"=>"keep-alive")
	);

	/**
	* Cookie jar: cookie name => cookie value, filled from Set-Cookie headers.
	* @var Array
	*/
	private $cookie = array();

	/**
	* Ready-to-send value of the Cookie request header built from the jar.
	* @var String
	*/
	private $scookie = '';

	/**
	* The " -H '...' " options for the curl command line.
	* @var String
	*/
	private $header = '';

	/**
	* @var String
	*/
	private $request = '';

	/**
	* Body of the most recent response.
	* @var String
	*/
	private $response = '';

	/**
	* Headers of the most recent response, keyed by lower-cased header name.
	* Every value is a string except 'set-cookie', which is a list of strings
	* (one entry per Set-Cookie line).
	* @var Array
	*/
	private $response_header = array();

	/**
	* URL of the next request.
	* @var String
	*/
	private $url = '';

	/**
	* File that curl dumps the response headers into (-D).
	* @var String
	*/
	private $header_file_path = '';

	/**
	* The last curl command line that was generated.
	* @var String
	*/
	private $cmd = '';

	/**
	* Number of redirects followed so far by the running explore() call.
	* @var int
	*/
	private $redirect_count = 0;


	/**
	* @param String $url address to fetch
	*/
	public function __construct($url = ''){
		$this->header_file_path = './header.txt';
		$this->url = $url;
	}

	/**
	* Builds the curl command line for the current URL, headers and cookies.
	*/
	private function gen_cmd(){
		$this->gen_header();
		// The URL can come from a server-supplied Location header, so it must
		// be quoted before it reaches the shell.
		$this->cmd = 'curl -s -D '.$this->shell_quote($this->header_file_path)
			.' '.$this->shell_quote($this->url)
			.' '.$this->header.' --compressed';
	}

	private function gen_request(){
		$this->gen_header();
	}

	/**
	* Rebuilds the -H options from the fixed headers plus the cookie jar.
	*/
	private function gen_header(){
		// Start from scratch: this runs once per request, and appending to the
		// previous value would repeat every header on each redirect hop.
		$this->header = '';
		foreach($this->init_header as $val){
			$this->header .= $this->add_header($val);
		}
		$this->gen_cookie();
		if($this->scookie !== ''){
			$this->header .= $this->add_header(array("key"=>"Cookie","val"=>$this->scookie));
		}
	}

	/**
	* Formats one header as a curl -H option.
	*
	* @param Array $header array with "key" and "val" entries
	* @return String
	*/
	private function add_header(array $header){
		return " -H ".$this->shell_quote($header["key"].":".$header["val"])." ";
	}

	/**
	* Quotes a string as one single-quoted POSIX shell word.
	*
	* Done by hand instead of with escapeshellarg() because that function can
	* drop bytes that are not valid in the current LC_CTYPE locale, which would
	* silently damage URLs that contain raw non-ASCII characters.
	*
	* @param String $value
	* @return String
	*/
	private function shell_quote($value){
		return "'".str_replace("'", "'\\''", (string)$value)."'";
	}


	/**
	* Builds the Cookie header value ("a=1; b=2") from the cookie jar.
	*/
	private function gen_cookie(){
		$pairs = array();
		foreach($this->cookie as $key => $val){
			$pairs[] = "{$key}={$val}";
		}
		$this->scookie = implode('; ', $pairs);
	}


	/**
	* Fetches the URL, echoes every response body received, and follows
	* Location redirects (at most MAX_REDIRECTS of them).
	*/
	public function explore(){
		if($this->url === ''){
			return;
		}
		$this->redirect_count = 0;
		do{
			// Remove the header dump of an earlier request so that a failed
			// curl run cannot be answered with stale headers.
			if(is_file($this->header_file_path)){
				unlink($this->header_file_path);
			}
			$this->gen_cmd();
			$this->response = (string)shell_exec($this->cmd);
			echo $this->response;
		}while($this->parse_response());
	}

	/**
	* Processes the headers of the response just received.
	*
	* @return bool true when $this->url was changed to a redirect target that
	*              still has to be fetched
	*/
	private function parse_response(){
		$this->parse_header_file();
		// Store the cookies first so that they are sent along with a redirect.
		$this->parse_cookie();

		$location = isset($this->response_header['location']) ? $this->response_header['location'] : '';
		if($location === '' || $this->redirect_count >= self::MAX_REDIRECTS){
			return false;
		}
		$this->redirect_count++;
		$this->url = $this->resolve_url($this->url, $location);
		return true;
	}

	/**
	* Turns a Location value into an absolute URL, using $base for whatever
	* parts the value leaves out. Dot segments ("../") are not collapsed here.
	*
	* @param String $base     URL of the request that produced the redirect
	* @param String $location non-empty Location header value
	* @return String
	*/
	private function resolve_url($base, $location){
		if(preg_match('#^[a-z][a-z0-9+.\-]*://#i', $location)){
			return $location;	// already absolute
		}
		$parts = parse_url($base);
		if($parts === false || !isset($parts['host'])){
			return $location;	// nothing usable to resolve against
		}
		$scheme = isset($parts['scheme']) ? $parts['scheme'] : 'http';
		if(strpos($location, '//') === 0){
			return $scheme.':'.$location;	// scheme-relative
		}
		$origin = $scheme.'://'.$parts['host'].(isset($parts['port']) ? ':'.$parts['port'] : '');
		if($location[0] === '/'){
			return $origin.$location;	// host-relative
		}
		$path = isset($parts['path']) ? $parts['path'] : '/';
		if($location[0] === '?'){
			return $origin.$path.$location;	// same path, new query string
		}
		$slash = strrpos($path, '/');
		$dir = ($slash === false) ? '/' : substr($path, 0, $slash + 1);
		return $origin.$dir.$location;
	}

	/**
	* Reads the header dump written by curl into $this->response_header.
	* Header names are lower-cased because HTTP header names are
	* case-insensitive. A missing or unreadable file yields no headers.
	*/
	private function parse_header_file(){
		$this->response_header = array();
		if(!is_readable($this->header_file_path)){
			return;
		}
		$lines = file($this->header_file_path, FILE_IGNORE_NEW_LINES);
		if($lines === false){
			return;
		}
		foreach($lines as $line){
			// The status line has no colon and is skipped by this pattern.
			if(!preg_match('/^([^:]+):(.*)$/', $line, $matches)){
				continue;
			}
			$name = strtolower(trim($matches[1]));
			$value = trim($matches[2]);
			if($name === 'set-cookie'){
				// May occur several times; keep every line separately.
				$this->response_header[$name][] = $value;
			}else{
				$this->response_header[$name] = $value;
			}
		}
	}


	/**
	* Copies the cookies of the current response into the cookie jar.
	*/
	private function parse_cookie(){
		if(empty($this->response_header['set-cookie'])){
			return;
		}
		foreach($this->response_header['set-cookie'] as $line){
			// Only the first "name=value" segment is the cookie; what follows
			// the first ';' are attributes (Path, Expires, ...) that must not
			// be sent back to the server.
			$segments = explode(";", $line, 2);
			$this->parse_key_val($segments[0]);
		}
	}

	/**
	* Stores one "name=value" pair in the cookie jar.
	*
	* @param String $equition text of the form "name=value"
	*/
	private function parse_key_val($equition){
		// Limit 2: the value itself may contain '=' (e.g. base64 padding).
		$res = explode("=", $equition, 2);
		if(count($res) < 2){
			return;
		}
		$name = trim($res[0]);
		if($name === ''){
			return;
		}
		$this->cookie[$name] = trim($res[1]);
	}


	/**
	* Echoes the last generated curl command line.
	*/
	public function print_cmd(){
		echo $this->cmd;
	}

	/**
	* Echoes the body of the last response.
	*/
	public function print_response(){
		echo $this->response;
	}
}

// Demo run against the weixin.sogou.com search URL (query=instachina).
// explore() echoes what it receives, so no separate print call is needed.
$start_url = "http://weixin.sogou.com/weixin?type=1&query=instachina&ie=utf8&_sug_=n&_sug_type_=";
$explore = new Explore($start_url);
$explore->explore();
// Optional: print the last response body once more.
//$explore->print_response();







